All packages used for analysis and figures in this page:
# General data wrangling
library(tidyverse)
library(knitr)
library(kableExtra)
library(DT)
library(readxl)
# Data visualization
library(plotly)
library(ggcorrplot)
library(psych)
library(GGally)
library(gridExtra)
library(factoextra)
library(FactoMineR)
# ggseg is used to visualize the brain
# remotes::install_github("LCBC-UiO/ggseg")
# If that doesn't work:
# download.file("https://github.com/LCBC-UiO/ggseg/archive/master.zip", "ggseg.zip")
# unzip("ggseg.zip")
# devtools::install_local("ggseg-master")
library(ggseg)
# remotes::install_github("LCBC-UiO/ggseg3d")
library(ggseg3d)
# remotes::install_github("LCBC-UiO/ggsegExtra")
library(ggsegExtra)
First, I’ll load the partial volume-corrected regional tau-PET data from ADNI. For more info on this dataset, please see Data Understanding and Acknowledgments.
# This file is on my local machine. Please refer to above paragraph for how to download this exact dataset.
tau.df <- read.csv("../../ADNI_Data/Raw_Data/UCBERKELEYAV1451_PVC_05_12_20.csv")
tau.df$EXAMDATE = as.Date(tau.df$EXAMDATE, format="%m/%d/%Y")
# update stamp is irrelevant, drop it
tau.df <- select(tau.df, -update_stamp)
# I also apply partial encryption to obscure the subject ID and exam date in presentation here
source("../encrypt_df.R")
tau.df <- encrypt_pet_real(tau.df)
Those without access to ADNI to download the same dataset can use the simulated dataset included in this GitHub repo main directory, “Simulated_ADNI_TauPET.csv”.
tau.df <- read.csv("https://github.com/anniegbryant/DA5030_Final_Project/raw/master/Simulated_ADNI_TauPET.csv")
I’ll filter this tau-PET data to contain only subjects with 2+ tau-PET scans, and omit irrelevant columns:
tau.df <- tau.df %>%
# Omit irrelevant columns
select(-VISCODE, -HEMIWM_SUVR, -BRAAK12_SUVR,
-BRAAK34_SUVR, -BRAAK56_SUVR, -OTHER_SUVR) %>%
# Don't include volumetric data columns
select(!matches("VOLUME")) %>%
group_by(RID)
# remove _SUVR from column names
colnames(tau.df) <- str_replace_all(colnames(tau.df), "_SUVR", "")
str(tau.df)
## tibble [1,120 x 78] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ RID : num [1:1120] 6741 9951 9951 17976 17976 ...
## $ VISCODE2 : chr [1:1120] "m144" "m144" "m156" "m144" ...
## $ EXAMDATE : Date[1:1120], format: "2018-02-16" "2018-05-08" "2019-05-07" ...
## $ INFERIOR_CEREBGM : num [1:1120] 1.32 1.33 1.33 1.28 1.24 ...
## $ BRAIN_STEM : num [1:1120] 1.27 1.12 1.12 1.2 1.17 ...
## $ LEFT_MIDDLEFR : num [1:1120] 2.02 1.93 1.8 1.83 1.78 ...
## $ LEFT_ORBITOFR : num [1:1120] 2.17 2.03 1.92 2.11 1.98 ...
## $ LEFT_PARSFR : num [1:1120] 2.02 2.01 1.98 2.03 1.99 ...
## $ LEFT_ACCUMBENS_AREA : num [1:1120] 1.14 1.04 1.79 1.12 1.18 ...
## $ LEFT_AMYGDALA : num [1:1120] 1.31 1.54 1.63 1.42 1.37 ...
## $ LEFT_CAUDATE : num [1:1120] 2.08 1.46 1.34 1.95 1.83 ...
## $ LEFT_HIPPOCAMPUS : num [1:1120] 2.12 1.96 2.2 1.69 1.73 ...
## $ LEFT_PALLIDUM : num [1:1120] 3.79 1.89 1.95 2.5 2.6 ...
## $ LEFT_PUTAMEN : num [1:1120] 1.69 1.64 1.42 1.9 1.78 ...
## $ LEFT_THALAMUS_PROPER : num [1:1120] 1.45 1.32 1.24 1.54 1.53 ...
## $ RIGHT_MIDDLEFR : num [1:1120] 2.08 1.91 1.8 1.94 1.85 ...
## $ RIGHT_ORBITOFR : num [1:1120] 2.19 2.01 1.86 2.17 2.03 ...
## $ RIGHT_PARSFR : num [1:1120] 2.17 2.08 1.9 2.09 2.01 ...
## $ RIGHT_ACCUMBENS_AREA : num [1:1120] 1.41 1.65 1.66 1.01 1.07 ...
## $ RIGHT_AMYGDALA : num [1:1120] 1.18 1.79 1.89 1.37 1.44 ...
## $ RIGHT_CAUDATE : num [1:1120] 2.01 1.57 1.37 1.96 1.89 ...
## $ RIGHT_HIPPOCAMPUS : num [1:1120] 2.01 2.09 2.03 1.62 1.64 ...
## $ RIGHT_PALLIDUM : num [1:1120] 3.01 2.32 2.12 2.33 2.48 ...
## $ RIGHT_PUTAMEN : num [1:1120] 1.68 1.62 1.53 2.06 1.94 ...
## $ RIGHT_THALAMUS_PROPER : num [1:1120] 1.42 1.33 1.24 1.52 1.55 ...
## $ CHOROID : num [1:1120] 7.45 4.56 4.31 3.84 3.79 ...
## $ CTX_LH_BANKSSTS : num [1:1120] 1.75 1.49 1.6 1.7 1.63 ...
## $ CTX_LH_CAUDALANTERIORCINGULATE : num [1:1120] 1.67 1.73 1.65 1.69 1.69 ...
## $ CTX_LH_CUNEUS : num [1:1120] 2.33 2.2 2.05 2.01 2 ...
## $ CTX_LH_ENTORHINAL : num [1:1120] 2.07 2.3 2.43 2.79 2.52 ...
## $ CTX_LH_FUSIFORM : num [1:1120] 1.97 1.87 1.83 1.84 1.77 ...
## $ CTX_LH_INFERIORPARIETAL : num [1:1120] 1.99 1.95 1.94 1.85 1.89 ...
## $ CTX_LH_INFERIORTEMPORAL : num [1:1120] 2.16 1.97 2.05 2.1 2.02 ...
## $ CTX_LH_INSULA : num [1:1120] 1.51 1.64 1.65 1.51 1.48 ...
## $ CTX_LH_ISTHMUSCINGULATE : num [1:1120] 1.9 1.81 1.82 1.79 1.94 ...
## $ CTX_LH_LATERALOCCIPITAL : num [1:1120] 2.39 2.06 1.99 1.92 2 ...
## $ CTX_LH_LINGUAL : num [1:1120] 2.27 1.95 1.97 1.76 1.74 ...
## $ CTX_LH_MIDDLETEMPORAL : num [1:1120] 2.2 2.06 1.89 2.04 1.99 ...
## $ CTX_LH_PARACENTRAL : num [1:1120] 1.99 1.79 1.8 1.91 1.8 ...
## $ CTX_LH_PARAHIPPOCAMPAL : num [1:1120] 1.6 1.86 1.92 1.72 1.66 ...
## $ CTX_LH_PERICALCARINE : num [1:1120] 2.23 1.45 1.41 1.56 1.54 ...
## $ CTX_LH_POSTCENTRAL : num [1:1120] 2.03 1.81 1.82 1.85 1.78 ...
## $ CTX_LH_POSTERIORCINGULATE : num [1:1120] 1.82 1.89 1.84 1.72 1.67 ...
## $ CTX_LH_PRECENTRAL : num [1:1120] 1.91 1.85 1.75 1.62 1.61 ...
## $ CTX_LH_PRECUNEUS : num [1:1120] 1.93 1.89 1.94 1.81 1.81 ...
## $ CTX_LH_ROSTRALANTERIORCINGULATE: num [1:1120] 1.71 1.58 1.49 1.59 1.48 ...
## $ CTX_LH_SUPERIORFRONTAL : num [1:1120] 1.86 1.86 1.74 1.84 1.77 ...
## $ CTX_LH_SUPERIORPARIETAL : num [1:1120] 1.88 1.99 1.97 1.9 1.87 ...
## $ CTX_LH_SUPERIORTEMPORAL : num [1:1120] 2.06 1.92 1.83 1.85 1.76 ...
## $ CTX_LH_SUPRAMARGINAL : num [1:1120] 1.93 1.83 1.74 1.81 1.81 ...
## $ CTX_LH_TEMPORALPOLE : num [1:1120] 2.23 2.05 1.87 1.96 1.8 ...
## $ CTX_LH_TRANSVERSETEMPORAL : num [1:1120] 1.75 1.73 1.72 1.67 1.47 ...
## $ CTX_RH_BANKSSTS : num [1:1120] 1.78 1.85 1.68 1.68 1.65 ...
## $ CTX_RH_CAUDALANTERIORCINGULATE : num [1:1120] 1.75 1.78 1.66 1.76 1.8 ...
## $ CTX_RH_CUNEUS : num [1:1120] 2.45 2.25 2.19 2.13 2.11 ...
## $ CTX_RH_ENTORHINAL : num [1:1120] 1.99 3.74 3.46 2.52 2.34 ...
## $ CTX_RH_FUSIFORM : num [1:1120] 1.98 1.79 1.75 1.8 1.71 ...
## $ CTX_RH_INFERIORPARIETAL : num [1:1120] 1.96 1.93 1.82 1.81 1.82 ...
## $ CTX_RH_INFERIORTEMPORAL : num [1:1120] 2.25 1.97 1.81 2.01 1.98 ...
## $ CTX_RH_INSULA : num [1:1120] 1.49 1.61 1.48 1.58 1.45 ...
## $ CTX_RH_ISTHMUSCINGULATE : num [1:1120] 2.03 1.86 1.85 1.85 1.81 ...
## $ CTX_RH_LATERALOCCIPITAL : num [1:1120] 2.27 1.9 1.89 1.86 1.92 ...
## $ CTX_RH_LINGUAL : num [1:1120] 2.12 2.07 1.88 1.87 1.83 ...
## $ CTX_RH_MIDDLETEMPORAL : num [1:1120] 2.2 1.85 1.81 2 1.94 ...
## $ CTX_RH_PARACENTRAL : num [1:1120] 1.82 1.79 1.63 1.89 1.89 ...
## $ CTX_RH_PARAHIPPOCAMPAL : num [1:1120] 1.52 1.94 1.94 1.84 1.74 ...
## $ CTX_RH_PERICALCARINE : num [1:1120] 2.02 2.07 2 1.6 1.62 ...
## $ CTX_RH_POSTCENTRAL : num [1:1120] 1.91 1.9 1.74 1.81 1.76 ...
## $ CTX_RH_POSTERIORCINGULATE : num [1:1120] 1.79 1.79 1.68 1.68 1.77 ...
## $ CTX_RH_PRECENTRAL : num [1:1120] 1.72 1.87 1.77 1.71 1.71 ...
## $ CTX_RH_PRECUNEUS : num [1:1120] 1.86 1.82 1.75 1.85 1.86 ...
## $ CTX_RH_ROSTRALANTERIORCINGULATE: num [1:1120] 1.78 1.63 1.57 1.52 1.49 ...
## $ CTX_RH_SUPERIORFRONTAL : num [1:1120] 1.86 1.9 1.81 1.82 1.79 ...
## $ CTX_RH_SUPERIORPARIETAL : num [1:1120] 1.78 2 1.9 1.93 1.92 ...
## $ CTX_RH_SUPERIORTEMPORAL : num [1:1120] 1.97 1.92 1.82 1.89 1.83 ...
## $ CTX_RH_SUPRAMARGINAL : num [1:1120] 1.8 1.81 1.76 1.8 1.79 ...
## $ CTX_RH_TEMPORALPOLE : num [1:1120] 2.29 2.3 2.21 1.98 1.75 ...
## $ CTX_RH_TRANSVERSETEMPORAL : num [1:1120] 1.96 2.14 1.91 1.62 1.55 ...
## - attr(*, "groups")= tibble [776 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ RID : num [1:776] 6741 9951 17976 18939 22149 ...
## ..$ .rows: list<int> [1:776]
## .. ..$ : int 1
## .. ..$ : int [1:2] 2 3
## .. ..$ : int [1:3] 4 5 6
## .. ..$ : int 7
## .. ..$ : int [1:3] 8 9 10
## .. ..$ : int [1:3] 11 12 13
## .. ..$ : int [1:3] 14 15 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int [1:3] 24 25 26
## .. ..$ : int 27
## .. ..$ : int [1:2] 28 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int [1:2] 32 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int [1:2] 36 37
## .. ..$ : int 38
## .. ..$ : int [1:2] 39 40
## .. ..$ : int [1:2] 41 42
## .. ..$ : int 43
## .. ..$ : int [1:2] 44 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int [1:3] 48 49 50
## .. ..$ : int [1:2] 51 52
## .. ..$ : int [1:3] 53 54 55
## .. ..$ : int 56
## .. ..$ : int [1:2] 57 58
## .. ..$ : int 59
## .. ..$ : int 60
## .. ..$ : int 61
## .. ..$ : int 62
## .. ..$ : int 63
## .. ..$ : int 64
## .. ..$ : int 65
## .. ..$ : int 66
## .. ..$ : int 67
## .. ..$ : int [1:4] 68 69 70 71
## .. ..$ : int 72
## .. ..$ : int [1:2] 73 74
## .. ..$ : int [1:2] 75 76
## .. ..$ : int 77
## .. ..$ : int 78
## .. ..$ : int [1:3] 79 80 81
## .. ..$ : int [1:3] 82 83 84
## .. ..$ : int 85
## .. ..$ : int 86
## .. ..$ : int 87
## .. ..$ : int [1:2] 88 89
## .. ..$ : int 90
## .. ..$ : int 91
## .. ..$ : int 92
## .. ..$ : int [1:3] 93 94 95
## .. ..$ : int [1:2] 96 97
## .. ..$ : int 98
## .. ..$ : int [1:2] 99 100
## .. ..$ : int 101
## .. ..$ : int 102
## .. ..$ : int 103
## .. ..$ : int 104
## .. ..$ : int [1:5] 105 106 107 108 109
## .. ..$ : int 110
## .. ..$ : int [1:3] 111 112 113
## .. ..$ : int [1:2] 114 115
## .. ..$ : int 116
## .. ..$ : int [1:4] 117 118 119 120
## .. ..$ : int [1:2] 121 122
## .. ..$ : int [1:4] 123 124 125 126
## .. ..$ : int [1:3] 127 128 129
## .. ..$ : int 130
## .. ..$ : int 131
## .. ..$ : int 132
## .. ..$ : int 133
## .. ..$ : int 134
## .. ..$ : int [1:2] 135 136
## .. ..$ : int 137
## .. ..$ : int 138
## .. ..$ : int [1:2] 139 140
## .. ..$ : int 141
## .. ..$ : int 142
## .. ..$ : int 143
## .. ..$ : int [1:3] 144 145 146
## .. ..$ : int [1:2] 147 148
## .. ..$ : int 149
## .. ..$ : int [1:2] 150 151
## .. ..$ : int 152
## .. ..$ : int 153
## .. ..$ : int 154
## .. ..$ : int 155
## .. ..$ : int 156
## .. ..$ : int 157
## .. ..$ : int 158
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
As shown in Data Understanding, the ROIs are not precisely standardized to the inferior cerebellum gray matter SUVR. I will re-standardize each region’s ROI SUVR values here.
# tau.stand = tau-PET dataframe with ROI SUVR values re-standardized to inferior cerebellum gray matter SUVR
tau.stand <- tau.df
# iterate over all ROI columns and standardize to inferior cerebellum gray -- tau.df[4]
for (i in 4:ncol(tau.stand)) {
tau.stand[i] <- tau.stand[i]/ tau.df[4]
}
rm(tau.df)
Standardization can be verified using summary:
summary(tau.stand$INFERIOR_CEREBGM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
Now that regional SUVR is properly standardized, the next step is to select brain regions based on a priori knowledge of where and how tau affects the brain in MCI/AD. I am going to stratify the cortical parcellations and subcortical segmentations based on Schöll et al. (2016) and per UCSF’s recommendations for usage of their tau-PET data. Here is the stratification across the Braak stages:
# Display the UCSF & Scholl 2016 ROI Braak stage datatable
roi.braak <- read.csv("../RData/roi_braak_stages.csv") %>%
mutate(ROI_Name = tolower(ROI_Name)) %>%
mutate(Hemisphere = ifelse(str_detect(ROI_Name, "rh_|right"), "Right", "Left"))
datatable(roi.braak %>% select(-Base_Name))
The following plots show the spatial relationship of the Braak stages in the brain, in the cortical (top) and subcortical (bottom) ROIs:
# Load dataframes that link the ADNI tau-PET ROIs with nomenclature used in the ggseg package
ggseg.aparc <- read_excel("../RData/ggseg_roi.xlsx", sheet=1) %>%
mutate(Braak=as.numeric(as.roman(Braak)))
ggseg.aseg <- read_excel("../RData/ggseg_roi.xlsx", sheet=2) %>%
mutate(Braak=as.numeric(as.roman(Braak)))
# Plot the Desikan-Killiany cortical parcellation atlas
p.aparc <- dk %>% filter(hemi=="right") %>%
unnest(ggseg) %>%
select(region) %>%
na.omit() %>%
distinct() %>%
left_join(., ggseg.aparc, by=c("region"="ggseg_ROI")) %>%
mutate(Braak = as.character(Braak)) %>%
# ggseg: dk=Desikan-Killiany atlas, fill by Braak region, label by ROI name
ggseg(atlas="dk", mapping=aes(fill=Braak, label=region)) +
scale_fill_manual(values=c("#F8766D", "#A3A617", "#00BF7D",
"#00B0F6", "#E76BF3"), na.value="gray80") +
theme(axis.text.y=element_blank(),
axis.text.x = element_text(family="calibri"),
axis.title.x = element_text(family="calibri"),
legend.title=element_text(family="calibri"),
legend.text=element_text(family="calibri"))
# Convert to plotly interactive visualization
ggplotly(p.aparc, tooltip = c("fill", "label"), width=800, height=300)
# Plot the FreeSurfer subcortical segmentation atlas
p.aseg <- aseg %>% filter(hemi=="right") %>%
unnest(ggseg) %>%
select(region) %>%
na.omit() %>%
distinct() %>%
left_join(., ggseg.aseg, by=c("region"="ggseg_ROI")) %>%
mutate(Braak = as.character(Braak)) %>%
# ggseg: aseg=subcortical segmentation atlas, fill by Braak region, label by ROI name
ggseg(atlas="aseg", mapping=aes(fill=Braak, label=region)) +
scale_fill_manual(values=c("deeppink", "#A3A617"), na.value="gray80") +
theme(axis.text.y=element_blank(),
axis.text.x = element_text(family="calibri"),
axis.title.x = element_text(family="calibri"),
legend.title=element_text(family="calibri"),
legend.text=element_text(family="calibri"))
# Convert to plotly interactive visualization
ggplotly(p.aseg, tooltip=c("fill", "label"), width=800, height=300)
I will filter the tau-PET dataset to only include SUVR data for ROIs detailed in the above list, by first reshaping the tau-PET SUVR data from wide to long. Then, I will merge left and right hemisphere ROIs into one bilateral ROI by taking the mean SUVR.
# ADNI tau-PET data includes other ROIs outside of this Braak stage dataset; for this study, I'm not looking at those
tau.stand.roi <- tau.stand %>%
pivot_longer(., cols=c(-RID, -VISCODE2, -EXAMDATE), names_to="ROI_Name", values_to="SUVR") %>%
mutate(ROI_Name=tolower(ROI_Name)) %>%
# Only keep ROIs included in Braak stage stratifcation via semi-join
semi_join(., roi.braak) %>%
# Once dataset has been filtered, add columns containing Braak stage and cortical lobe
left_join(., roi.braak) %>%
# Remove right/left distinction from ROI name
mutate(ROI_Name = str_replace_all(ROI_Name, "right_|left_|ctx_rh_|ctx_lh_", "")) %>%
# Group by bilateral ROI -- e.g. "hippocampus" which contains left and right hippocampus
dplyr::group_by(RID, VISCODE2, EXAMDATE, ROI_Name, Braak) %>%
# Calculate ROI average SUVR
dplyr::summarise(SUVR = mean(SUVR, na.rm=T))
data.frame(ROI=unique(tau.stand.roi$ROI_Name)) %>%
left_join(., roi.braak, by=c("ROI"="Base_Name")) %>%
select(-ROI_Name, -Hemisphere) %>%
distinct() %>%
datatable()
Now, I will re-shape the tau-PET data back to wide to be compatible with the cognitive status data shape.
# Reshape wide --> long to be merged with CDR-SoB cognitive data
tau.stand.roi <- tau.stand.roi %>%
select(-Braak) %>%
pivot_wider(id_cols=c(RID, VISCODE2, EXAMDATE), names_from="ROI_Name",
values_from="SUVR")
str(tau.stand.roi)
## tibble [1,120 x 34] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ RID : num [1:1120] 6741 9951 9951 17976 17976 ...
## $ VISCODE2 : chr [1:1120] "m144" "m144" "m156" "m144" ...
## $ EXAMDATE : Date[1:1120], format: "2018-02-16" "2018-05-08" "2019-05-07" ...
## $ amygdala : num [1:1120] 0.943 1.254 1.327 1.09 1.127 ...
## $ bankssts : num [1:1120] 1.34 1.26 1.23 1.32 1.32 ...
## $ caudalanteriorcingulate : num [1:1120] 1.29 1.32 1.25 1.35 1.4 ...
## $ cuneus : num [1:1120] 1.81 1.67 1.6 1.62 1.65 ...
## $ entorhinal : num [1:1120] 1.54 2.28 2.22 2.07 1.95 ...
## $ fusiform : num [1:1120] 1.5 1.38 1.35 1.42 1.4 ...
## $ hippocampus : num [1:1120] 1.56 1.52 1.59 1.29 1.36 ...
## $ inferiorparietal : num [1:1120] 1.5 1.46 1.42 1.43 1.49 ...
## $ inferiortemporal : num [1:1120] 1.67 1.48 1.45 1.61 1.61 ...
## $ insula : num [1:1120] 1.14 1.22 1.18 1.21 1.18 ...
## $ isthmuscingulate : num [1:1120] 1.49 1.38 1.38 1.42 1.51 ...
## $ lateraloccipital : num [1:1120] 1.77 1.49 1.46 1.47 1.58 ...
## $ lingual : num [1:1120] 1.66 1.52 1.45 1.42 1.43 ...
## $ middlefr : num [1:1120] 1.55 1.44 1.36 1.48 1.46 ...
## $ middletemporal : num [1:1120] 1.67 1.47 1.39 1.58 1.58 ...
## $ orbitofr : num [1:1120] 1.65 1.52 1.43 1.67 1.61 ...
## $ paracentral : num [1:1120] 1.44 1.35 1.3 1.48 1.48 ...
## $ parahippocampal : num [1:1120] 1.18 1.43 1.45 1.39 1.37 ...
## $ parsfr : num [1:1120] 1.59 1.54 1.46 1.61 1.61 ...
## $ pericalcarine : num [1:1120] 1.61 1.33 1.29 1.23 1.27 ...
## $ postcentral : num [1:1120] 1.49 1.4 1.34 1.43 1.42 ...
## $ posteriorcingulate : num [1:1120] 1.37 1.39 1.33 1.33 1.38 ...
## $ precentral : num [1:1120] 1.37 1.4 1.33 1.3 1.34 ...
## $ precuneus : num [1:1120] 1.43 1.4 1.39 1.43 1.48 ...
## $ rostralanteriorcingulate: num [1:1120] 1.32 1.21 1.15 1.21 1.2 ...
## $ superiorfrontal : num [1:1120] 1.41 1.42 1.34 1.43 1.43 ...
## $ superiorparietal : num [1:1120] 1.39 1.5 1.46 1.49 1.52 ...
## $ superiortemporal : num [1:1120] 1.53 1.45 1.38 1.46 1.44 ...
## $ supramarginal : num [1:1120] 1.41 1.37 1.32 1.41 1.45 ...
## $ temporalpole : num [1:1120] 1.71 1.64 1.54 1.54 1.43 ...
## $ transversetemporal : num [1:1120] 1.41 1.46 1.37 1.28 1.21 ...
## - attr(*, "groups")= tibble [1,120 x 4] (S3: tbl_df/tbl/data.frame)
## ..$ RID : num [1:1120] 6741 9951 9951 17976 17976 ...
## ..$ VISCODE2: chr [1:1120] "m144" "m144" "m156" "m144" ...
## ..$ EXAMDATE: Date[1:1120], format: "2018-02-16" "2018-05-08" "2019-05-07" ...
## ..$ .rows : list<int> [1:1120]
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..$ : int 59
## .. ..$ : int 60
## .. ..$ : int 61
## .. ..$ : int 62
## .. ..$ : int 63
## .. ..$ : int 64
## .. ..$ : int 65
## .. ..$ : int 66
## .. ..$ : int 67
## .. ..$ : int 68
## .. ..$ : int 69
## .. ..$ : int 70
## .. ..$ : int 71
## .. ..$ : int 72
## .. ..$ : int 73
## .. ..$ : int 74
## .. ..$ : int 75
## .. ..$ : int 76
## .. ..$ : int 77
## .. ..$ : int 78
## .. ..$ : int 79
## .. ..$ : int 80
## .. ..$ : int 81
## .. ..$ : int 82
## .. ..$ : int 83
## .. ..$ : int 84
## .. ..$ : int 85
## .. ..$ : int 86
## .. ..$ : int 87
## .. ..$ : int 88
## .. ..$ : int 89
## .. ..$ : int 90
## .. ..$ : int 91
## .. ..$ : int 92
## .. ..$ : int 93
## .. ..$ : int 94
## .. ..$ : int 95
## .. ..$ : int 96
## .. ..$ : int 97
## .. ..$ : int 98
## .. ..$ : int 99
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
ADNI compiled a merged dataset containing key information from several tables, including subject demographics, selected cognitive assessment scores, and select biomarker data.
I am interested in the following features in this dataset:
RID: Participant roster ID, which serves as unique subject identifierVISCODE: Visit codeEXAMDATE: DateAGE: Age at visitPTGENDER: Biological sexCDRSB: CDR Sum-of-Boxes score at visit# Load subject demographic information + cognitive assessment dataset
# NOTE: this data is on my local machine as it is the real ADNI data
subj.info <- read.csv("../../ADNI_Data/Raw_Data/ADNIMERGE.csv", stringsAsFactors = T, na.strings="")
subj.info <- subj.info
Those without access to ADNI to download the same dataset can use the simulated dataset included in this GitHub repo main directory, “Simulated_ADNI_cognitive_scores.csv”.
subj.info <- read.csv("https://github.com/anniegbryant/DA5030_Final_Project/raw/master/Simulated_ADNI_cognitive_scores.csv") %>% select(RID, VISCODE, AGE, PTGENDER, CDRSB)
To keep this data consistent with the tau-PET data, I will also partially encrypt this dataset to obscure the subject identifier and exam date. All other data will be kept as-is for this analysis.
source("../encrypt_df.R")
subj.info <- encrypt_subj_real(subj.info) %>% select(RID, VISCODE, AGE, PTGENDER, CDRSB)
I actually can’t join the two datasets on the EXAMDATE feature, as these sometimes differ by one or two days depending on when the records were entered. Instead, I will join by the RID subject identifier and VISCODE, a visit code identifier.
# full.df = merged tau-PET data and cognitive assessment data
full.df <- inner_join(tau.stand.roi, subj.info, by=c("RID", "VISCODE2"="VISCODE")) %>%
filter(!is.na(CDRSB)) %>%
group_by(RID) %>%
dplyr::mutate(n_visits = n()) %>%
# Only keep subjects with 2+ PET scans and cognitive assessments
filter(n_visits>1) %>%
select(-n_visits)
Click to see the structure of this merged dataset:
str(full.df)
## tibble [576 x 37] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ RID : num [1:576] 9951 9951 17976 17976 17976 ...
## $ VISCODE2 : chr [1:576] "m144" "m156" "m144" "m156" ...
## $ EXAMDATE : Date[1:576], format: "2018-05-08" "2019-05-07" "2018-03-06" ...
## $ amygdala : num [1:576] 1.25 1.33 1.09 1.13 1.05 ...
## $ bankssts : num [1:576] 1.26 1.23 1.32 1.32 1.3 ...
## $ caudalanteriorcingulate : num [1:576] 1.32 1.25 1.35 1.4 1.21 ...
## $ cuneus : num [1:576] 1.67 1.6 1.62 1.65 1.56 ...
## $ entorhinal : num [1:576] 2.28 2.22 2.07 1.95 1.98 ...
## $ fusiform : num [1:576] 1.38 1.35 1.42 1.4 1.34 ...
## $ hippocampus : num [1:576] 1.52 1.59 1.29 1.36 1.34 ...
## $ inferiorparietal : num [1:576] 1.46 1.42 1.43 1.49 1.45 ...
## $ inferiortemporal : num [1:576] 1.48 1.45 1.61 1.61 1.58 ...
## $ insula : num [1:576] 1.22 1.18 1.21 1.18 1.08 ...
## $ isthmuscingulate : num [1:576] 1.38 1.38 1.42 1.51 1.36 ...
## $ lateraloccipital : num [1:576] 1.49 1.46 1.47 1.58 1.59 ...
## $ lingual : num [1:576] 1.52 1.45 1.42 1.43 1.38 ...
## $ middlefr : num [1:576] 1.44 1.36 1.48 1.46 1.42 ...
## $ middletemporal : num [1:576] 1.47 1.39 1.58 1.58 1.56 ...
## $ orbitofr : num [1:576] 1.52 1.43 1.67 1.61 1.5 ...
## $ paracentral : num [1:576] 1.35 1.3 1.48 1.48 1.41 ...
## $ parahippocampal : num [1:576] 1.43 1.45 1.39 1.37 1.29 ...
## $ parsfr : num [1:576] 1.54 1.46 1.61 1.61 1.5 ...
## $ pericalcarine : num [1:576] 1.33 1.29 1.23 1.27 1.22 ...
## $ postcentral : num [1:576] 1.4 1.34 1.43 1.42 1.34 ...
## $ posteriorcingulate : num [1:576] 1.39 1.33 1.33 1.38 1.25 ...
## $ precentral : num [1:576] 1.4 1.33 1.3 1.34 1.25 ...
## $ precuneus : num [1:576] 1.4 1.39 1.43 1.48 1.37 ...
## $ rostralanteriorcingulate: num [1:576] 1.21 1.15 1.21 1.2 1.06 ...
## $ superiorfrontal : num [1:576] 1.42 1.34 1.43 1.43 1.35 ...
## $ superiorparietal : num [1:576] 1.5 1.46 1.49 1.52 1.43 ...
## $ superiortemporal : num [1:576] 1.45 1.38 1.46 1.44 1.37 ...
## $ supramarginal : num [1:576] 1.37 1.32 1.41 1.45 1.37 ...
## $ temporalpole : num [1:576] 1.64 1.54 1.54 1.43 1.47 ...
## $ transversetemporal : num [1:576] 1.46 1.37 1.28 1.21 1.06 ...
## $ AGE : num [1:576] 77.7 77.7 69.6 69.6 69.6 72.9 72.9 72.9 79.6 79.6 ...
## $ PTGENDER : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 2 2 2 ...
## $ CDRSB : num [1:576] 0 0 0.5 0.5 0 0.5 0 0.5 0 0 ...
## - attr(*, "groups")= tibble [243 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ RID : num [1:243] 9951 17976 22149 30816 35952 ...
## ..$ .rows: list<int> [1:243]
## .. ..$ : int [1:2] 1 2
## .. ..$ : int [1:3] 3 4 5
## .. ..$ : int [1:3] 6 7 8
## .. ..$ : int [1:3] 9 10 11
## .. ..$ : int [1:3] 12 13 14
## .. ..$ : int [1:3] 15 16 17
## .. ..$ : int [1:2] 18 19
## .. ..$ : int [1:2] 20 21
## .. ..$ : int [1:2] 22 23
## .. ..$ : int [1:2] 24 25
## .. ..$ : int [1:2] 26 27
## .. ..$ : int [1:2] 28 29
## .. ..$ : int [1:3] 30 31 32
## .. ..$ : int [1:2] 33 34
## .. ..$ : int [1:3] 35 36 37
## .. ..$ : int [1:2] 38 39
## .. ..$ : int [1:4] 40 41 42 43
## .. ..$ : int [1:2] 44 45
## .. ..$ : int [1:3] 46 47 48
## .. ..$ : int [1:3] 49 50 51
## .. ..$ : int [1:3] 52 53 54
## .. ..$ : int [1:2] 55 56
## .. ..$ : int [1:2] 57 58
## .. ..$ : int [1:4] 59 60 61 62
## .. ..$ : int [1:3] 63 64 65
## .. ..$ : int [1:2] 66 67
## .. ..$ : int [1:4] 68 69 70 71
## .. ..$ : int [1:2] 72 73
## .. ..$ : int [1:4] 74 75 76 77
## .. ..$ : int [1:3] 78 79 80
## .. ..$ : int [1:2] 81 82
## .. ..$ : int [1:2] 83 84
## .. ..$ : int [1:3] 85 86 87
## .. ..$ : int [1:2] 88 89
## .. ..$ : int [1:2] 90 91
## .. ..$ : int [1:3] 92 93 94
## .. ..$ : int [1:2] 95 96
## .. ..$ : int [1:4] 97 98 99 100
## .. ..$ : int [1:2] 101 102
## .. ..$ : int [1:2] 103 104
## .. ..$ : int [1:2] 105 106
## .. ..$ : int [1:2] 107 108
## .. ..$ : int [1:3] 109 110 111
## .. ..$ : int [1:3] 112 113 114
## .. ..$ : int [1:4] 115 116 117 118
## .. ..$ : int [1:2] 119 120
## .. ..$ : int [1:2] 121 122
## .. ..$ : int [1:2] 123 124
## .. ..$ : int [1:2] 125 126
## .. ..$ : int [1:3] 127 128 129
## .. ..$ : int [1:3] 130 131 132
## .. ..$ : int [1:2] 133 134
## .. ..$ : int [1:2] 135 136
## .. ..$ : int [1:2] 137 138
## .. ..$ : int [1:2] 139 140
## .. ..$ : int [1:2] 141 142
## .. ..$ : int [1:3] 143 144 145
## .. ..$ : int [1:2] 146 147
## .. ..$ : int [1:2] 148 149
## .. ..$ : int [1:3] 150 151 152
## .. ..$ : int [1:4] 153 154 155 156
## .. ..$ : int [1:2] 157 158
## .. ..$ : int [1:2] 159 160
## .. ..$ : int [1:2] 161 162
## .. ..$ : int [1:3] 163 164 165
## .. ..$ : int [1:2] 166 167
## .. ..$ : int [1:3] 168 169 170
## .. ..$ : int [1:2] 171 172
## .. ..$ : int [1:2] 173 174
## .. ..$ : int [1:3] 175 176 177
## .. ..$ : int [1:2] 178 179
## .. ..$ : int [1:2] 180 181
## .. ..$ : int [1:3] 182 183 184
## .. ..$ : int [1:2] 185 186
## .. ..$ : int [1:3] 187 188 189
## .. ..$ : int [1:3] 190 191 192
## .. ..$ : int [1:2] 193 194
## .. ..$ : int [1:3] 195 196 197
## .. ..$ : int [1:4] 198 199 200 201
## .. ..$ : int [1:2] 202 203
## .. ..$ : int [1:2] 204 205
## .. ..$ : int [1:2] 206 207
## .. ..$ : int [1:2] 208 209
## .. ..$ : int [1:2] 210 211
## .. ..$ : int [1:2] 212 213
## .. ..$ : int [1:2] 214 215
## .. ..$ : int [1:2] 216 217
## .. ..$ : int [1:2] 218 219
## .. ..$ : int [1:4] 220 221 222 223
## .. ..$ : int [1:2] 224 225
## .. ..$ : int [1:4] 226 227 228 229
## .. ..$ : int [1:2] 230 231
## .. ..$ : int [1:2] 232 233
## .. ..$ : int [1:3] 234 235 236
## .. ..$ : int [1:3] 237 238 239
## .. ..$ : int [1:2] 240 241
## .. ..$ : int [1:2] 242 243
## .. ..$ : int [1:4] 244 245 246 247
## .. ..$ : int [1:3] 248 249 250
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
cat("\nNumber of longitudinal tau-PET scans with accompanying cognitive data: **\n",
nrow(full.df), "**\nNumber of subjects in merged dataset: **",
length(unique(full.df$RID)), "**\n", "\n", sep="")
Number of longitudinal tau-PET scans with accompanying cognitive data: 576 Number of subjects in merged dataset: 243
As it turns out, only 576 of the original 593 tau-PET scans had corresponding cognitive assessments. This leaves 576 unique PET scan datapoints for 243 subjects.
Lastly, before I can perform outlier detection, I need to derive the longitudinal features upon which the prediction models will be built – namely, annual change in tau-PET SUVR and annual change in CDR-Sum of Boxes score.
# Calculate change in tau-PET SUVR and CDR-SoB over time, normalized to # of years elapsed
annual.changes <- full.df %>%
ungroup() %>%
select(-VISCODE2) %>%
# Reshape wide --> long
pivot_longer(cols=c(-RID, -EXAMDATE, -PTGENDER, -AGE), names_to="Metric",
values_to="Value") %>%
# Calculate number of years between each visit as well as change in SUVR or CDR-SoB
dplyr::group_by(RID, PTGENDER, AGE, Metric) %>%
dplyr::summarise(n_years = as.numeric((EXAMDATE - lag(EXAMDATE,
default=EXAMDATE[1]))/365),
change = Value - lag(Value, default=Value[1])) %>%
# Remove data points corresponding to first visit
filter(n_years > 0) %>%
# Calculate change in either tau-PET SUVR or CDR-SoB change per year
dplyr::mutate(Annual_Change = change/n_years) %>%
# Remove columns that are no longer needed
select(-n_years, -change) %>%
group_by(RID, Metric) %>%
# Assign row identifier with row_number()
dplyr::mutate(interval_num = row_number()) %>%
# Reshape from long --> wide
pivot_wider(., id_cols=c(RID, AGE, PTGENDER, interval_num), names_from=Metric,
values_from=Annual_Change) %>%
rename("Age_Baseline" = "AGE",
"Sex" = "PTGENDER")
annual.changes %>% select(RID, interval_num, amygdala, entorhinal, hippocampus) %>%
mutate_if(is.numeric, function(x) round(x, 4)) %>%
datatable()
Now that the datasets are merged, I can perform outlier detection. Given the multivariate nature of this dataset (i.e. multiple brain regions), I will use Cook’s Distance to estimate the relative influence of each data point in a simple multiple regression model.
# Calculate Cook's Distance using multiple regression of CDR-SoB annual change on tau-PET SUVR change for all 31 ROIs
cooks.distance <- cooks.distance(lm(CDRSB ~ . - RID - interval_num, data=annual.changes))
# Plot Cook's distance for each subject
p.cooks <- data.frame(CD=cooks.distance) %>%
rownames_to_column(var="Data_Point") %>%
mutate(Data_Point=as.numeric(Data_Point)) %>%
mutate(Label=ifelse(CD>30, Data_Point, NA_real_)) %>%
ggplot(data=., mapping=aes(x=Data_Point,y=CD)) +
geom_hline(yintercept = 4*mean(cooks.distance,na.rm=T), color="blue") +
geom_point() +
geom_text(aes(label=Label), nudge_y=1.5) +
ylab("Cook's Distance") +
xlab("annual.changes index") +
theme_minimal()
# Convert to interactive plotly visualization
ggplotly(p.cooks)
rm(p.cooks)
All but one data point have relatively low Cook’s distance values, while data point #224 has a relatively large Cook’s distance. This suggests large residuals and leverage associated with this datapoint, which could distort model fitting and accuracy. Upon further examination of this instance:
# Show data from row 224
as.data.frame(t(annual.changes[224,])) %>%
rownames_to_column(var="Variable") %>%
dplyr::rename("Value" = "V1") %>%
datatable()
This subject exhibits very large fluctuations in tau-PET SUVR values in several brain regions for this associated time interval. Given that SUVR values typically range from 0.75-2, changes of this large magnitude is surprising, and may certainly render this data point an outlier. Fortunately, the interval_num of 2 indicates that this is the second time interval for this subject, so omitting this interval doesn’t reduce the total number of subjects in the analysis. I will remove this data point:
# Omit row 224
annual.changes <- annual.changes[-224,]
I can now finish some aspects of data exploration that depended upon refining the subject cohort as well as the features. For starters, I will examine the distribution of annual tau change in each of the 31 ROIs:
# reshape tau-PET SUVR change ROI-wise data from wide --> long
annual.changes.tau <- annual.changes %>%
select(-CDRSB) %>%
ungroup() %>%
pivot_longer(cols=c(-RID, -interval_num, -Age_Baseline, -Sex), names_to="ROI",
values_to="deltaSUVR")
# Plot SUVR change distribution for each ROI using multi.hist from psych package
multi.hist(annual.changes %>% ungroup() %>% select(-RID, -interval_num, -CDRSB, -Age_Baseline, -Sex),
dcol="red")
The distribution looks reasonably normal for each ROI, and all of the curves peak around zero, suggesting all of the ROIs have a mean of ~0. Since there are both negative values and values of zero in these data, neither log nor square root transformation would be possible, anyway. Therefore, I will leave the variable distribution as-is.
Next, I will visualize the correlation in annual tau change between each of the ROIs measured:
# Select only tau-PET ROIs from annual change dataset
annual.roi <- annual.changes %>% ungroup() %>% select(-RID, -interval_num, -CDRSB, -Age_Baseline, -Sex)
# Calculate correlation matrix between all ROIs
roi.cor <- cor(annual.roi)
# Plot the correlation matrix using ggcorrplot, order by hierarchical clustering
ggcorrplot(roi.cor, hc.order = TRUE, outline.col = "white") %>%
# Convert to interactive correlogram with plotly
ggplotly(width=800, height=700)
As it turns out, all ROIs show positive correlations in the annual rate of change in tau-PET uptake, with the exception of three ROI pairs:
These are very weak correlations, and can be further visualized with scatter plots:
# Highlight the ROIs noted above with (slightly) negative correlations
# Use ggpairs function from GGally to visualize their pairwise distributions
p.select.rois <- ggpairs(annual.roi %>% select(entorhinal, pericalcarine,
transversetemporal,
lateraloccipital),
# Add regression line to scatter plots
lower = list(continuous = wrap("smooth", se=F,
method = "lm",
color="lightslategray",
alpha=0.4))) +
theme_minimal()
# Convert to interactive pair plot with plotly
ggplotly(p.select.rois, width=700, height=600)
These negative correlations are indeed weak and mostly noise, based on the scatter plots. Regarding the other positive correlations, I am curious as to whether there are underlying trends based on either spatial proximity and/or tau progression in AD, based on cortical lobe and Braak regions, respectively:
# Convert correlation matrix from wide --> long, each row will detail one pairwise correlation
roi.cor.long <- as.data.frame(roi.cor) %>%
rownames_to_column(var="ROI1") %>%
pivot_longer(cols=c(-ROI1), names_to="ROI2", values_to="Pearson_Corr") %>%
# Exclude rows where the two ROIs are the same, where correlation is always 1
filter(ROI1 != ROI2) %>%
# Join with Braak-stage stratification dataframe by the first ROI column
left_join(., roi.braak, by=c("ROI1"="Base_Name")) %>%
select(-ROI_Name, -Hemisphere) %>%
# Specify that Braak + Cortex info pertain to the first ROI column, not the second
dplyr::rename("ROI1_Braak" = "Braak", "ROI1_Cortex" = "Cortex") %>%
# Merge again with Braak-stage stratification, this time by the second ROI column
left_join(., roi.braak, by=c("ROI2" = "Base_Name")) %>%
select(-ROI_Name, -Hemisphere) %>%
dplyr::rename("ROI2_Braak" = "Braak", "ROI2_Cortex" = "Cortex") %>%
# Rename Insula as Ins for visualization purpose
mutate_at(c("ROI1_Cortex", "ROI2_Cortex"), function(x) ifelse(x=="Insula", "Ins", x))
# Plot the correlation by Cortical Lobe
p.cor.cortical <- roi.cor.long %>%
ggplot(., mapping=aes(x=ROI1, y=ROI2)) +
# Heatmap, fill squares by pearson correlation coefficient
geom_tile(mapping=aes(fill=Pearson_Corr)) +
labs(fill="Pearson Coefficient") +
theme_minimal() +
# Facet by cortical lobes
facet_grid(ROI2_Cortex ~ ROI1_Cortex, scales="free",
space="free", switch="both") +
ggtitle("Correlation in Annual Tau SUVR Change by Cortical Lobe") +
theme(axis.title=element_blank(),
axis.text.y = element_text(size=11),
axis.text.x = element_text(angle=90, size=11, hjust=1),
panel.border = element_blank(),
panel.grid = element_blank(),
plot.title=element_text(hjust=0.5)) +
theme(strip.placement = "outside") +
scale_fill_gradient2(low="#210DFC", mid="white", high="#FF171B",
limits=c(-1,1))
# Save correlation matrix to PNG and print -- dimensions were bizarre if I plotted directly from ggplot
ggsave("ROI_Correlation_Cortical.png", plot=p.cor.cortical, width=9, height=7.5, units="in", dpi=300)
include_graphics("ROI_Correlation_Cortical.png")
There are somewhat stronger inter-correlations within the frontal and parietal cortices compared with other cortical lobes. Now stratifying based on ROI Braak stage:
# Plot the correlation by Braak Stage
p.cor.braak <- roi.cor.long %>%
ggplot(., mapping=aes(x=ROI1, y=ROI2)) +
# Heatmap, fill squares by pearson correlation coefficient
geom_tile(mapping=aes(fill=Pearson_Corr)) +
labs(fill="Pearson Coefficient") +
theme_minimal() +
# Facet by Braak stage
facet_grid(ROI2_Braak ~ ROI1_Braak, scales="free",
space="free", switch="both") +
ggtitle("Correlation in Annual Tau SUVR Change by Braak Stage") +
theme(axis.title=element_blank(),
axis.text.y = element_text(size=11),
axis.text.x = element_text(angle=90, size=11, hjust=1),
panel.border = element_blank(),
panel.grid = element_blank()) +
theme(strip.placement = "outside") +
scale_fill_gradient2(low="#210DFC", mid="white", high="#FF171B",
limits=c(-1,1))
# Save correlation matrix to PNG and print -- dimensions were bizarre if I plotted directly from ggplot
ggsave("ROI_Correlation_Braak.png", plot=p.cor.braak, width=9, height=7.5, units="in", dpi=300)
include_graphics("ROI_Correlation_Braak.png")
This generally high correlation in annual tau-PET SUVR changes between cortical regions may pose a challenge when it comes to modeling, due to feature collinearity.
While I want to keep each region distinct for biological context, I will also reduce the dimensionality of the data using principal component analysis (PCA), which has an added benefit of yielding orthogonal un-correlated components to serve as input for the modeling phase. Since all the variables are in the same unit (i.e. change in SUVR per year), I will only need to center the data, not scale it.
# Convert dataframe to numerical matrix
pca.df <- as.matrix(annual.changes %>% ungroup() %>% select(-RID, -CDRSB, -interval_num, -Age_Baseline, -Sex))
# Perform PCA with prcomp() from stats package
# Center data but don't scale, as all columns are in same units (change in SUVR per year)
res.pca <- prcomp(pca.df, center=T, scale.=F)
# The variable info can be extracted as follows:
var <- get_pca_var(res.pca)
The proportion of variance explained by each principal component (PC) can be visualized using a Scree plot:
# Calculate cumulative proportion of variance explained
cumpro <- cumsum(res.pca$sdev^2 / sum(res.pca$sdev^2)*100)
# Calculate individual proportion of variance explained by each principal component
variances <- data.frame((res.pca$sdev^2/sum(res.pca$sdev^2))*100)
# Label PCs
variances$PC <- c(1:31)
# Add in cumulative proportion of variance to dataframe
variances$cumpro <- cumpro
# Establish column names
colnames(variances) <- c("Variance_Proportion", "PC", "CumVar")
# Create a new row just for visualization purpose, helps with axis structure
newrow <- subset(variances, PC == 31)
newrow$PC <- 31.5
variances <- plyr::rbind.fill(variances, newrow)
# Set individual variance line as maroon, cumulative variance line as green
linecolors <- c("Component Variance" = "maroon4",
"Cumulative Variance" = "green4")
# Plot the customized Scree plot
p.var <- variances %>%
ggplot(data=.) +
# Add a bar for each principal component, showing individual proportion of variance
geom_bar(data=subset(variances, PC < 32), mapping=aes(x=PC, y=Variance_Proportion),
stat="identity", fill="steelblue") +
# Add line for individual component variance
geom_line(aes(color="Component Variance", x=PC, y=Variance_Proportion),
size=0.7, data=subset(variances, PC < 31.1), show.legend=F) +
# Add line for cumulative variance explained with each additional principal component
geom_line(aes(x=PC, y=CumVar, color="Cumulative Variance"), size=0.7,
data=subset(variances, PC < 32)) +
geom_point(aes(x=PC, y=Variance_Proportion),data=subset(variances, PC < 31.1), size=1.5) +
# Set line colors as defined above
scale_colour_manual(name="",values=linecolors,
guide = guide_legend(override.aes=list(size=2))) +
theme_minimal() +
ylab("Percentage of Variance Explained") +
xlab("Principal Component") +
ggtitle("Principal Components\nContribution to Subject Variance") +
# Manually define x-axis and y-axis limits so line aligns with bar plot
xlim(c(0.5, 31.5)) +
scale_y_continuous(breaks=seq(0, 100, 10),
sec.axis = dup_axis(name="")) +
theme(plot.title = element_text(hjust=0.5, size=14),
axis.text = element_text(size=12),
panel.grid = element_blank(),
legend.position="bottom",
legend.text = element_text(size=12))
# Convert to plotly interactive visualization
ggplotly(p.var, tooltip=c("x","y")) %>%
layout(legend = list(orientation = "h", y=-0.2))
The first five principal components (PCs) collectively explain 77.2% of variance in the data; beyond these components, there are only marginal increases in the cumulative variance explained. Therefore, I will move forward with these first five PCs.
Individual ROI contributions (loadings) per component can be extracted:
# variable loadings are stored in the $rotation vector of prcomp output
loadings_wide <- data.frame(res.pca$rotation) %>%
# add rownames as column
cbind(ROI=rownames(.), .) %>%
# remove original row names
remove_rownames() %>%
# Select ROI and first five PCs
select(ROI:PC5) %>%
rowwise() %>%
# Join with Braak stage stratification dataframe
left_join(., roi.braak, by=c("ROI"="Base_Name")) %>%
select(ROI, Cortex, Braak, PC1:PC5) %>%
distinct()
# Print interactive datatable
datatable(loadings_wide %>% mutate_if(is.numeric, function(x) round(x,4)))
I’m curious as to whether ROIs exhibit similar covariance in annual tau-PET changes based on spatial proximity (i.e. cortical region) and/or similar Alzheimer’s Disease progression (i.e. Braak stage).
# Plot loadings colored by cortical lobe
p.cortex <- loadings_wide %>%
ggplot(data=., mapping=aes(x=PC1, y=PC2, label=ROI)) +
geom_hline(yintercept=0, linetype=2, alpha=0.5) +
geom_vline(xintercept=0, linetype=2, alpha=0.5) +
geom_point(aes(color=Cortex), size=3) +
theme_minimal() +
xlab("PC1 (41.6% Variance)") +
ylab("PC2 (14.0% Variance)") +
ggtitle("ROI PC Loadings by Cortical Region") +
theme(plot.title=element_text(hjust=0.5))
# Convert to interactive scatterplot with plotly
ggplotly(p.cortex, tooltip=c("label", "x", "y"))
rm(p.cortex)
The first note is that all of the ROIs exhibit a negative loading (correlation) with PC1. Beyond that, all of the occipital and parietal cortex ROIs are positively correlated with PC2, while the insula, temporal cortex, and cingulate cortex ROIs are all negatively correlated with PC2. The frontal cortex ROIs are right on the border of PC2, low correlations in both directions.
# Plot PC loadings colored by Braak stage
p.braak <- loadings_wide %>%
ggplot(data=., mapping=aes(x=PC1, y=PC2, label=ROI)) +
geom_hline(yintercept=0, linetype=2, alpha=0.5) +
geom_vline(xintercept=0, linetype=2, alpha=0.5) +
geom_point(aes(color=Braak), size=3) +
theme_minimal() +
xlab("PC1 (41.6% Variance)") +
ylab("PC2 (14.0% Variance)") +
ggtitle("ROI PC Loadings by Braak Stage") +
theme(plot.title=element_text(hjust=0.5))
# Convert to interactive scatterplot with plotly
ggplotly(p.braak, tooltip=c("label", "x", "y"))
rm(p.braak)
There is not as clear a distinction to be made based on ROI Braak stage. One observation that does stand out is that all of the Braak VI ROIs are relatively close in the upper right of the points. Beyond that, the Braak stages are mixed in this loading plot.
Moving on, the subject and time interval info can be linked with the PCA results:
# post.pca = subject identification info + first five PCs
post.pca <- as.data.frame(res.pca$x[,1:5]) %>%
cbind(., RID=annual.changes$RID) %>%
cbind(., interval_num=annual.changes$interval_num) %>%
cbind(., CDRSB=annual.changes$CDRSB) %>%
cbind(., Age_Baseline=annual.changes$Age_Baseline) %>%
cbind(., Sex=annual.changes$Sex) %>%
select(RID, interval_num, Age_Baseline, Sex, CDRSB, PC1:PC5)
# print interactive datatable
datatable(post.pca %>% mutate_if(is.numeric, function(x) round(x,5)) %>% select(-Age_Baseline, -Sex))
The final step is to convert sex into a dummy variable for modeling:
# Encode sex as a binary variable for Sex_Male
annual.changes$Sex_Male <- ifelse(annual.changes$Sex=="Male", 1, 0)
post.pca$Sex_Male <- ifelse(post.pca$Sex=="Male", 1, 0)
# Remove original Sex feature
annual.changes <- annual.changes %>% select(-Sex)
post.pca <- post.pca %>% select(-Sex)
I’ll save these prepared datasets to an .RData file for modeling:
save(annual.changes, post.pca, file="../RData/Prepared_Data.RData")
Note: I am not including the RData in my GitHub repo as it contains real ADNI data. The data contained within the RData files can be recreated using the simulated datasets by going through the code chunks in this script.